library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidycensus)
library(sf)
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(gganimate)
https://medium.com/qri-io/taming-the-mtas-unruly-turnstile-data-c945f5f96ba0 was an important resource for this section.
http://web.mta.info/developers/developer-data-terms.html#data provided much of the data.
This script uses the files from turnstile_200201.txt all the way to turnstile_200523.txt from http://web.mta.info/developers/turnstile.html.
# turnstile_data <- read_delim("turnstile_200201.txt", ",")
# start <- as.Date("02-08-20",format="%m-%d-%y")
# end <- as.Date("05-23-20",format="%m-%d-%y")
# theDate <- start
# while (theDate <= end) {
# temp =read_delim(paste0("turnstile_", format(theDate,"%y%m%d"), ".txt"), ",")
# turnstile_data <- turnstile_data %>% bind_rows(temp)
# print(theDate)
# theDate <- theDate + 7
# }
#
# turnstile_data <- turnstile_data %>%
# mutate(DATE = mdy(DATE),
# ENTRIES = as.numeric(ENTRIES)) %>% rename(EXITS = `EXITS `) %>% mutate(EXITS = as.numeric(EXITS)) %>%
# mutate(DATETIME = ymd_hms(paste(DATE, TIME)))
# turnstile_data <- turnstile_data %>%
# mutate(id = paste(`C/A`, UNIT, SCP, DATE, TIME)) %>%
# mutate(unit_id = paste(`C/A`, UNIT, SCP)) %>%
# arrange(id)
Loading data instead of processing it, so it’s faster.
load("turnstile_data")
turnstile_data <- turnstile_data %>%
group_by(unit_id) %>%
mutate(net_entries = abs(ENTRIES - lag(ENTRIES, 1)),
net_exits = abs(EXITS - lag(EXITS, 1))) %>%
mutate(net_entries = replace(net_entries, net_entries > 10000, 0)) %>%
mutate(net_exits = replace(net_exits, net_exits > 10000, 0)) %>%
ungroup()
# group_by(unit_id) %>%
# mutate(net_entries = abs(lead(ENTRIES, 1) - ENTRIES),
# net_exits = abs(lead(EXITS, 1) - EXITS)) %>%
# mutate(net_entries = replace(net_entries, net_entries > 10000, 0)) %>%
# mutate(net_exits = replace(net_exits, net_exits > 10000, 0)) %>%
# ungroup()
Credits to Chris Whong for compiling the remote_complex_lookup data table.
station_booths <- readxl::read_excel("Remote-Booth-Station.xls")
remote_complex_lookup <- read_csv("remote_complex_lookup.csv")
## Parsed with column specification:
## cols(
## remote = col_character(),
## booth = col_character(),
## complex_id = col_double(),
## station = col_character(),
## line_name = col_character(),
## division = col_character()
## )
turnstile_data_joined <- turnstile_data %>% left_join(remote_complex_lookup,
by = c("UNIT" = "remote",
"C/A" = "booth"))
stations <- read_csv("Stations.csv")
## Parsed with column specification:
## cols(
## `Station ID` = col_double(),
## `Complex ID` = col_double(),
## `GTFS Stop ID` = col_character(),
## Division = col_character(),
## Line = col_character(),
## `Stop Name` = col_character(),
## Borough = col_character(),
## `Daytime Routes` = col_character(),
## Structure = col_character(),
## `GTFS Latitude` = col_double(),
## `GTFS Longitude` = col_double(),
## `North Direction Label` = col_character(),
## `South Direction Label` = col_character()
## )
turnstile_data_joined_stations <-
turnstile_data_joined %>%
left_join(
stations %>%
group_by(`Complex ID`) %>%
filter(row_number() == 1) %>%
ungroup(),
by = c("complex_id" = "Complex ID")
) %>%
filter(DIVISION != "PTH", DIVISION != "RIT")
turnstile_summaries <-
turnstile_data_joined_stations %>%
group_by(round_date(DATETIME, "4 hours")) %>%
filter(n() > 5) %>% summarize(entry_sum = sum(net_entries, na.rm = TRUE),
exit_sum = sum(net_exits, na.rm = TRUE)) %>%
rename(datetime = 1)
ggplot(
data = turnstile_summaries %>% group_by(the_date = date(datetime)) %>% summarize(
entry_sum = sum(entry_sum),
exit_sum = sum(exit_sum)
) %>% pivot_longer(ends_with("_sum"), names_to = 'sumtype', values_to = 'the_sum')
) +
geom_line(mapping = aes(
x = the_date,
y = the_sum,
group = sumtype,
color = sumtype
)) +
geom_point(mapping = aes(
x = the_date,
y = the_sum,
group = sumtype,
color = sumtype
)) +
labs(
title = "Total Daily Turnstile Entries and Exits",
x = "Date",
y = "Total Daily Count",
color = "Legend",
subtitle = paste(
"Orange Vertical Line is when State of Emergency was Declared in NYC (March 12)",
"Red Vertical Line is when Statewide Stay-At-Home Order was Enacted (March 22)",
sep = "\n"
)
) +
scale_color_manual(values = c("#7cae00", "#00bfc4"), labels = c("Entries", "Exits")) +
theme(legend.position = "right") +
scale_y_continuous(labels = scales::comma) +
geom_vline(
mapping = aes(xintercept = mdy("03-12-2020")),
color = "dark orange",
linetype = "dashed"
) +
geom_vline(
mapping = aes(xintercept = mdy("03-22-2020")),
color = "red",
linetype = "dashed"
)
Animated between hours.
entries_transition <- ggplot(data = turnstile_summaries) +
geom_line(mapping = aes(x = datetime, y = entry_sum)) +
geom_point(mapping = aes(x = datetime, y = entry_sum)) +
transition_states(
hour(datetime),
transition_length = 0,
state_length = 1
) +
labs(title = 'Total Entries into Subway System by Time of Day', subtitle = 'Hour of Day: {closest_state}', x = 'Date', y = 'Total System Entries')
exits_transition <- ggplot(data = turnstile_summaries) +
geom_line(mapping = aes(x = datetime, y = exit_sum)) +
geom_point(mapping = aes(x = datetime, y = exit_sum)) +
transition_states(
hour(datetime),
transition_length = 0,
state_length = 1
) +
labs(title = 'Total Exits from Subway System by Time of Day', subtitle = 'Hour of Day: {closest_state}', x = 'Date', y = 'Total System Exits')
# animate(entries_transition, height = 5, width = 5, units = "in", res = 150)
# anim_save("entries_transition.gif")
# animate(exits_transition, height = 5, width = 5, units = "in", res = 150)
# anim_save("exits_transition.gif")
ggplot(data = turnstile_summaries %>%
pivot_longer(ends_with("_sum"), names_to = 'sumtype', values_to = 'the_sum')) +
geom_line(mapping = aes(
x = datetime,
y = the_sum,
group = sumtype,
color = sumtype
)) +
geom_point(mapping = aes(
x = datetime,
y = the_sum,
group = sumtype,
color = sumtype
)) +
scale_y_continuous(labels = scales::comma) +
facet_wrap(vars(hour(datetime))) +
labs(
title = 'Total Turnstile Entries and Exits by Time of Day',
x = 'Date',
y = 'Total Count',
color = "Legend",
subtitle = paste(
"Orange Vertical Line is when State of Emergency was Declared in NYC (March 12)",
"Red Vertical Line is when Statewide Stay-At-Home Order was Enacted (March 22)",
sep = "\n"
)
) +
scale_color_manual(values = c("#7cae00", "#00bfc4"),
labels = c("Entries", "Exits")) +
theme(legend.position = "right") +
geom_vline(
mapping = aes(xintercept = mdy("03-12-2020") %>% as_datetime()),
color = "dark orange",
linetype = "dashed"
) +
geom_vline(
mapping = aes(xintercept = mdy("03-22-2020") %>% as_datetime()),
color = "red",
linetype = "dashed"
)
# get summary by hour for the entirety of the selected period
get_hourly_summaries <- function(periodtype, filterperiod, filterhour, summaryobject) {
if (periodtype == "date") {
temp = date(summaryobject$datetime)
} else if (periodtype == "week") {
temp = week(summaryobject$datetime)
} else if (periodtype == "month") {
temp = month(summaryobject$datetime)
}
time_filtered_summaryobject <- summaryobject %>% filter(
hour(datetime) == filterhour,
temp == filterperiod)
weekday_only_summaryobject <-
time_filtered_summaryobject %>% filter(!(wday(datetime, label = TRUE) %in% c("Sat", "Sun")))
weekend_only_summaryobject <-
time_filtered_summaryobject %>% filter((wday(datetime, label = TRUE) %in% c("Sat", "Sun")))
if (nrow(weekday_only_summaryobject) != 0) {
weekday_hourly_entry_avg = weekday_only_summaryobject$entry_sum %>% mean(na.rm = TRUE)
weekday_hourly_exit_avg = weekday_only_summaryobject$exit_sum %>% mean(na.rm = TRUE)
} else {
weekday_hourly_entry_avg = NA
weekday_hourly_exit_avg = NA
}
if (nrow(weekend_only_summaryobject) != 0) {
weekend_hourly_entry_avg = weekend_only_summaryobject$entry_sum %>% mean(na.rm = TRUE)
weekend_hourly_exit_avg = weekend_only_summaryobject$exit_sum %>% mean(na.rm = TRUE)
} else {
weekend_hourly_entry_avg = NA
weekend_hourly_exit_avg = NA
}
entry_avgs <- c(weekday_hourly_entry_avg, weekend_hourly_entry_avg)
exit_avgs <- c(weekday_hourly_exit_avg, weekend_hourly_exit_avg)
toReturn = tibble(
c("Weekday", "Weekend"),
c(filterperiod, filterperiod),
c(filterhour, filterhour),
entry_avgs,
exit_avgs,
c(periodtype, periodtype)
) %>% rename(DayType = 1, Period = 2, Hour = 3, EntryAvgs = 4, ExitAvgs = 5, PeriodType = 6)
return(toReturn)
}
get_percentage_of_period <- function(summaryobject, periodtype) {
if (periodtype == "month") {
periods <- unique(month(summaryobject$datetime))
} else if (periodtype == "week") {
periods <- unique(week(summaryobject$datetime))
} else if (periodtype == "date") {
periods <- unique(date(summaryobject$datetime))
}
hours <- c(0,4,8,12,16,20)
febavg <- NULL
for (currhour in hours) {
febavg <- febavg %>%
bind_rows(get_hourly_summaries("month", 2, currhour, summaryobject))
}
febavg <-
febavg %>% group_by(DayType) %>% mutate(
EntryAvgsTotal = sum(EntryAvgs),
ExitAvgsTotal = sum(ExitAvgs),
ProportionOfDailyEntry = EntryAvgs / EntryAvgsTotal,
ProportionOfDailyExit = ExitAvgs / ExitAvgsTotal
) %>%
ungroup() %>%
rename(FebEntryAvgs = EntryAvgs,
FebExitAvgs = ExitAvgs,
FebProportionOfDailyEntry = ProportionOfDailyEntry,
FebProportionOfDailyExit = ProportionOfDailyExit) %>%
select(-c(EntryAvgsTotal, ExitAvgsTotal, Period, PeriodType))
toReturn <- NULL
for (currperiod in periods) {
for (currhour in hours) {
toReturn <-
toReturn %>% bind_rows(get_hourly_summaries(periodtype, currperiod, currhour, summaryobject))
}
}
toReturn <-
toReturn %>% group_by(DayType, Period) %>% mutate(
EntryAvgsTotal = sum(EntryAvgs),
ExitAvgsTotal = sum(ExitAvgs),
ProportionOfDailyEntry = EntryAvgs / EntryAvgsTotal,
ProportionOfDailyExit = ExitAvgs / ExitAvgsTotal
) %>%
ungroup() %>%
left_join(febavg, by=c("DayType", "Hour")) %>%
drop_na(EntryAvgs) %>% mutate(
EntryAvgsChange = FebEntryAvgs - EntryAvgs,
ExitAvgsChange = FebExitAvgs - ExitAvgs,
PercentageOfDailyEntryChange = 100*(FebProportionOfDailyEntry - ProportionOfDailyEntry),
PercentageOfDailyExitChange = 100*(FebProportionOfDailyExit - ProportionOfDailyExit)
)
return(toReturn)
}
# as_date(18293) ==> "2020-02-01"
# NOTE: period is always stored as an integer
turnstile_date_percentages <- get_percentage_of_period(turnstile_summaries, "date")
turnstile_week_percentages <- get_percentage_of_period(turnstile_summaries, "week")
turnstile_month_percentages <- get_percentage_of_period(turnstile_summaries, "month")
EntryRatioToEntireDayAvg gives the ratio between average weekday/weekend entries for that hour and the total average weekday/weekend entries
ggplot(data = turnstile_week_percentages) +
geom_point(aes(
x = as_date("2020-01-01") + (7*turnstile_week_percentages$Period),
y = ProportionOfDailyEntry,
color = as.factor(Hour)
)) +
geom_line(aes(
x = as_date("2020-01-01") + (7*turnstile_week_percentages$Period),
y = ProportionOfDailyEntry,
color = as.factor(Hour),
group = as.factor(Hour)
)) +
facet_wrap(vars(DayType)) +
labs(
title = "Proportion of Daily Total Turnstile Entries by Time Interval",
x = "Month",
y = "Proportion of All Rides",
color = "Interval Ending At",
subtitle = paste(
"Orange Vertical Line is when State of Emergency was Declared in NYC (March 12)",
"Red Vertical Line is when Statewide Stay-At-Home Order was Enacted (March 22)",
sep = "\n"
)
) +
geom_vline(
mapping = aes(xintercept = mdy("03-12-2020")),
color = "dark orange",
linetype = "dashed"
) +
geom_vline(
mapping = aes(xintercept = mdy("03-22-2020")),
color = "red",
linetype = "dashed"
)
## Warning: Use of `turnstile_week_percentages$Period` is discouraged. Use
## `Period` instead.
## Warning: Use of `turnstile_week_percentages$Period` is discouraged. Use
## `Period` instead.
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
ggplot(data = turnstile_date_percentages) +
geom_point(aes(
x = as_date(Period),
y = ProportionOfDailyEntry,
color = as.factor(Hour)
)) +
geom_line(aes(
x = as_date(Period),
y = ProportionOfDailyEntry,
color = as.factor(Hour),
group = as.factor(Hour)
)) +
facet_wrap(vars(DayType)) +
labs(
title = "Proportion of Daily Total Turnstile Entries by Time Interval (Daily Averages)",
x = "Month",
y = "Proportion of All Rides",
color = "Interval Ending At",
subtitle = paste(
"Orange Vertical Line is when State of Emergency was Declared in NYC (March 12)",
"Red Vertical Line is when Statewide Stay-At-Home Order was Enacted (March 22)",
sep = "\n"
)
) +
geom_vline(
mapping = aes(xintercept = mdy("03-12-2020")),
color = "dark orange",
linetype = "dashed"
) +
geom_vline(
mapping = aes(xintercept = mdy("03-22-2020")),
color = "red",
linetype = "dashed"
)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
ggplot(data = turnstile_month_percentages %>% filter(Period %in% c(2, 4)) %>%
mutate(Period = month.abb[Period])) +
geom_point(aes(x = ((Hour - 2) + 24) %% 24,
y = ProportionOfDailyEntry,
color = as.factor(Period))) +
geom_line(aes(x = ((Hour - 2) + 24) %% 24,
y = ProportionOfDailyEntry,
color = Period,
group = as.factor(Period))) +
facet_grid(. ~ DayType) +
labs(
x = "Time Of Day",
y = "Proportion of All Rides",
title = "Proportion of Daily Total Turnstile Entries by Time Interval in February and April",
subtitle = "Points are at Time Interval Midpoints for Easier Visual Interpretation",
color = "Month",
caption = "For instance, 10am in this chart corresponds to 12pm in the previous chart"
)
view(turnstile_month_percentages %>% filter(Period == 2))
# curr_vars <- load_variables(2018, "acs5")
bronxzip <- as.character(c(10453, 10457, 10460, 10458, 10467, 10468, 10451, 10452, 10456, 10454, 10455, 10459, 10474, 10463, 10471, 10466, 10469, 10470, 10475, 10461, 10462, 10464, 10465, 10472, 10473))
brooklynzip <- as.character(c(11212, 11213, 11216, 11233, 11238, 11209, 11214, 11228, 11204, 11218, 11219, 11230, 11234, 11236, 11239, 11223, 11224, 11229, 11235, 11201, 11205, 11215, 11217, 11231, 11203, 11210, 11225, 11226, 11207, 11208, 11211, 11222, 11220, 11232, 11206, 11221, 11237))
manhattanzip <- as.character(c(10026, 10027, 10030, 10037, 10039, 10001, 10011, 10018, 10019, 10020, 10036, 10029, 10035, 10010, 10016, 10017, 10022, 10012, 10013, 10014, 10004, 10005, 10006, 10007, 10038, 10280, 10002, 10003, 10009, 10021, 10028, 10044, 10065, 10075, 10128, 10023, 10024, 10025, 10031, 10032, 10033, 10034, 10040))
queenszip <- as.character(c(11361, 11362, 11363, 11364, 11354, 11355, 11356, 11357, 11358, 11359, 11360, 11365, 11366, 11367, 11412, 11423, 11432, 11433, 11434, 11435, 11436, 11101, 11102, 11103, 11104, 11105, 11106, 11374, 11375, 11379, 11385, 11691, 11692, 11693, 11694, 11695, 11697, 11004, 11005, 11411, 11413, 11422, 11426, 11427, 11428, 11429, 11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421, 11368, 11369, 11370, 11372, 11373, 11377, 11378))
sizip <- as.character(c(10302, 10303, 10310, 10306, 10307, 10308, 10309, 10312, 10301, 10304, 10305, 10314))
nyczip <- as.character(c(bronxzip, brooklynzip, manhattanzip, queenszip, sizip))
zipcode_data <- get_acs(
geography = "zip code tabulation area",
variables = c(MedianIncome = "B19013_001",
TotalPopulation = "B01003_001",
TotalHousingUnits = "B25001_001",
TransitCommuteNum = "B08301_010",
AggregateTransportTime = "B08136_001"),
year = 2018
) %>% filter(GEOID %in% nyczip)
zipcode_data_spread <- zipcode_data %>%
pivot_wider(
id_cols = c(GEOID, NAME),
names_from = variable,
values_from = c(estimate)
)
zipcode_shapefiles <- get_acs(
geography = "zip code tabulation area",
variables = c(TotalPopulation = "B01003_001"),
geometry = TRUE,
keep_geo_vars = TRUE,
year = 2018
) %>% filter(GEOID %in% nyczip) %>% select(-c(estimate, moe, variable))
zipcode_data_joined <- zipcode_data_spread %>% inner_join(zipcode_shapefiles) %>%
mutate(PopDensity = TotalPopulation / (ALAND10 / 2589988),
TransitUsageRate = TransitCommuteNum / TotalPopulation,
AvgTransportTime = AggregateTransportTime / TotalPopulation)
zipcode_data_sf <-zipcode_data_joined %>% st_sf() %>% st_transform(4326)
zipcode_data_transformed <- zipcode_data_sf %>%
st_transform(2831)
turnstile_data_grouped <- turnstile_data_joined_stations %>%
group_by(complex_id, DATETIME) %>%
summarize(latitude = mean(`GTFS Latitude`),
longitude = mean(`GTFS Longitude`),
net_entries = sum(net_entries, na.rm = TRUE),
net_exits = sum(net_exits, na.rm = TRUE)) %>%
ungroup() %>%
filter(!is.na(latitude))
turnstile_data_sf_together <-
turnstile_data_grouped %>%
group_by(complex_id) %>%
summarize(latitude = mean(`latitude`),
longitude = mean(`longitude`),
net_entries = sum(net_entries, na.rm = TRUE),
net_exits = sum(net_exits), na.rm = TRUE) %>%
filter(!is.na(latitude)) %>%
st_as_sf(coords = c("longitude", "latitude"))
st_crs(turnstile_data_sf_together) <- 4326
turnstile_data_sf_together_transformed <-
turnstile_data_sf_together %>% st_transform(2831)
combined_table <-
st_join(turnstile_data_sf_together, zipcode_data_sf)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
combined_table_nas <-
combined_table %>% filter(is.na(GEOID)) %>% select(c(1:7)) %>%
st_join(zipcode_data_sf, join = st_nearest_feature)
## although coordinates are longitude/latitude, st_nearest_feature assumes that they are planar
combined_table <-
combined_table %>% drop_na(GEOID) %>% bind_rows(combined_table_nas)
## Warning in bind_rows_(x, .id): Vectorizing 'sfc_POINT' elements may not
## preserve their attributes
## Warning in bind_rows_(x, .id): Vectorizing 'sfc_POINT' elements may not
## preserve their attributes
turnstile_data_grouped <- turnstile_data_joined_stations %>%
group_by(complex_id, DATETIME) %>%
summarize(latitude = mean(`GTFS Latitude`),
longitude = mean(`GTFS Longitude`),
net_entries = sum(net_entries, na.rm = TRUE),
net_exits = sum(net_exits, na.rm = TRUE)) %>%
ungroup() %>%
filter(!is.na(latitude))
turnstile_data_grouped_febonly <-
turnstile_data_grouped %>% filter(month(DATETIME) == 2) %>%
mutate(IsWeekend = wday(DATETIME, label = TRUE) %in% c("Sat", "Sun")) %>%
mutate(DayType = if_else(IsWeekend, "Weekend", "Weekday")) %>%
select(-IsWeekend)
daily_turnstile_data_summarized_febonly <-
turnstile_data_grouped_febonly %>%
mutate(datetime_round = round_date(DATETIME, "4 hours"), DayType) %>%
# group_by(Hour = hour(datetime_round), DayType, complex_id) %>%
group_by(DayType, complex_id) %>%
summarize(
Feb_net_entries = mean(net_entries, na.rm = TRUE),
Feb_net_exits = mean(net_exits, na.rm = TRUE)
) %>% ungroup()
turnstile_data_grouped <-
turnstile_data_grouped %>%
mutate(IsWeekend = wday(DATETIME, label = TRUE) %in% c("Sat", "Sun")) %>%
mutate(DayType = if_else(IsWeekend, "Weekend", "Weekday")) %>%
select(-IsWeekend) %>%
mutate(datetime_round = round_date(DATETIME, "4 hours"), DayType) %>%
mutate(Hour = hour(datetime_round))
daily_turnstile_data_grouped_summarized <-
turnstile_data_grouped %>%
group_by(Month = month(DATETIME), DayType, complex_id ) %>% #, Hour) %>%
summarize(net_entries = mean(net_entries, na.rm = TRUE),
net_exits = mean(net_exits, na.rm = TRUE)) %>%
inner_join(daily_turnstile_data_summarized_febonly) %>%
mutate(
Diff_net_entries = net_entries - Feb_net_entries,
Diff_net_exits = net_exits - Feb_net_exits,
Diff_net_entries_percentage = 100 * (Diff_net_entries / Feb_net_entries),
Diff_net_exits_percentage = 100 * (Diff_net_exits / Feb_net_exits)
)
## Joining, by = c("DayType", "complex_id")
daily_turnstile_data_grouped_summarized_sf <- daily_turnstile_data_grouped_summarized %>% left_join(
stations %>%
group_by(`Complex ID`) %>%
filter(row_number() == 1) %>%
ungroup(),
by = c("complex_id" = "Complex ID")
) %>% rename(latitude = `GTFS Latitude`,
longitude = `GTFS Longitude`) %>%
filter(!is.na(latitude)) %>%
select(-c(`North Direction Label`, `South Direction Label`, Structure, `Daytime Routes`)) %>%
st_as_sf(coords = c("longitude", "latitude"))
st_crs(daily_turnstile_data_grouped_summarized_sf) <- 4326
daily_combined_table <-
st_join(daily_turnstile_data_grouped_summarized_sf, zipcode_data_sf)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
daily_combined_table_nas <-
daily_combined_table %>% filter(is.na(GEOID)) %>% select(c(1:7)) %>%
st_join(zipcode_data_sf, join = st_nearest_feature)
## although coordinates are longitude/latitude, st_nearest_feature assumes that they are planar
daily_combined_table <-
daily_combined_table %>% drop_na(GEOID) %>% bind_rows(daily_combined_table_nas)
## Warning in bind_rows_(x, .id): Vectorizing 'sfc_POINT' elements may not
## preserve their attributes
## Warning in bind_rows_(x, .id): Vectorizing 'sfc_POINT' elements may not
## preserve their attributes
daily_combined_table_sf <- daily_combined_table %>% st_as_sf()
st_crs(daily_combined_table_sf) <- 4326
station_complex_to_zip <- daily_combined_table %>% group_by(complex_id) %>%
summarize(Zip = first(GEOID), geometry = first(geometry), MedianIncome = first(MedianIncome),
PopDensity = first(PopDensity)) %>% ungroup()
ggplot(
data = daily_combined_table %>% filter(
net_entries > 10,
Feb_net_entries > 10,
Month == 4,
# Diff_net_entries_percentage < 0,
DayType == "Weekday"
) %>% mutate(
Borough = recode(
Borough,
Bx = "Bronx",
Bk = "Brooklyn",
M = "Manhattan",
Q = "Queens",
SI = "Staten Island"
)
)
) +
geom_point(aes(x = `MedianIncome`, y = Diff_net_entries_percentage, color = Borough)) +
labs(title = "Percentage Decline in Daily System Entries (Weekday) between February & April",
subtitle = "Percentage change, by station",
x = "Station Zip Code Median Income",
y = "Percentage Change")
ggplot(
data = daily_combined_table %>% filter(
net_entries > 10,
Feb_net_entries > 10,
Month == 4,
# Diff_net_entries_percentage < 0,
DayType == "Weekday"
) %>% mutate(
Borough = recode(
Borough,
Bx = "Bronx",
Bk = "Brooklyn",
M = "Manhattan",
Q = "Queens",
SI = "Staten Island"
)
)
) +
geom_point(aes(x = log(`MedianIncome`, 10), y = Diff_net_entries_percentage, color = Borough)) +
labs(title = "Percentage Decline in Daily System Entries (Weekday) between February & April",
subtitle = "Percentage change, by station",
x = "log10(Station Zip Code Median Income)",
y = "Percentage Change") +
geom_smooth(method='lm', aes(x = log(MedianIncome, 10), y = Diff_net_entries_percentage),
se = FALSE, color = "black")
ggplot(
data = daily_combined_table %>% filter(
net_entries > 10,
Feb_net_entries > 10,
Month == 4,
Diff_net_entries_percentage < -25,
DayType == "Weekend"
) %>% mutate(
Borough = recode(
Borough,
Bx = "Bronx",
Bk = "Brooklyn",
M = "Manhattan",
Q = "Queens",
SI = "Staten Island"
)
)
) +
geom_point(aes(x = `MedianIncome`, y = Diff_net_entries_percentage, color = Borough)) +
labs(title = "Percentage Decline in Daily System Entries (Weekend) between February and April",
subtitle = "Percentage change, by station",
x = "Station Zip Code Median Income",
y = "Percentage Change")
ggplot(
data = daily_combined_table_sf %>% filter(
# net_entries > 10,
# Feb_net_entries > 10,
Month == 4,!is.na(Diff_net_entries_percentage),
DayType == "Weekday"
) %>% mutate(
Borough = recode(
Borough,
Bx = "Bronx",
Bk = "Brooklyn",
M = "Manhattan",
Q = "Queens",
SI = "Staten Island"
)
)
) +
labs(title = "Percentage Decline in Daily System Entries (Weekday) between February and April",
subtitle = "Percentage change, by station",
fill = "Median Zip Code Income ($)",
color = "Percentage Change from February",
size = "Average Daily Entries in February") +
geom_sf(
data = zipcode_data_sf,
aes(fill = MedianIncome, geometry = geometry),
size = 0.25,
colour = "gray"
) +
scale_fill_gradient(
low = "#C9E3F2",
high = "#071469",
limits = c(0, 100000),
oob = scales::squish
) +
scale_color_gradient(
low = "#8C0003",
high = "pink",
oob = scales::squish
) +
geom_sf(
aes(
geometry = geometry,
color = Diff_net_entries_percentage,
size = Feb_net_entries
),
shape = 16
)
ggplot(
data = daily_combined_table_sf %>% filter(
# net_entries > 10,
# Feb_net_entries > 10,
Month == 4,!is.na(Diff_net_entries_percentage),
DayType == "Weekend"
) %>% mutate(
Borough = recode(
Borough,
Bx = "Bronx",
Bk = "Brooklyn",
M = "Manhattan",
Q = "Queens",
SI = "Staten Island"
)
)
) +
labs(title = "Percentage Decline in Daily System Entries (Weekend) between February and April",
subtitle = "Percentage change, by station",
fill = "Median Zip Code Income ($)",
color = "Percentage Change from February",
size = "Average Daily Entries in February") +
geom_sf(
data = zipcode_data_sf,
aes(fill = MedianIncome, geometry = geometry),
size = 0.25,
colour = "gray"
) +
scale_fill_gradient(
low = "#C9E3F2",
high = "#071469",
limits = c(0, 100000),
oob = scales::squish
) +
scale_color_gradient(
low = "#8C0003",
high = "pink",
limits = c(-100, -60),
oob = scales::squish
) +
geom_sf(
aes(
geometry = geometry,
color = Diff_net_entries_percentage,
size = Feb_net_entries
),
shape = 16
)
ggplot(
data = daily_combined_table_sf %>% filter(
# net_entries > 10,
# Feb_net_entries > 10,
Month == 4,!is.na(Diff_net_entries_percentage),
DayType == "Weekday"
) %>% mutate(
Borough = recode(
Borough,
Bx = "Bronx",
Bk = "Brooklyn",
M = "Manhattan",
Q = "Queens",
SI = "Staten Island"
)
)
) +
labs(title = "Average Decline in Daily System Exits (Weekday) between February and April",
subtitle = "Percentage change, by station",
fill = "Median Zip Code Income ($)",
color = "Percentage Change from February",
size = "Average Daily Exits in February") +
geom_sf(
data = zipcode_data_sf,
aes(fill = MedianIncome, geometry = geometry),
size = 0.25,
colour = "gray"
) +
scale_fill_gradient(
low = "#C9E3F2",
high = "#071469",
limits = c(0, 100000),
oob = scales::squish
) +
scale_color_gradient(
low = "dark green",
high = "light green",
limits = c(-100, -60),
oob = scales::squish
) +
geom_sf(
aes(
geometry = geometry,
color = Diff_net_exits_percentage,
size = Feb_net_exits
),
shape = 16
)
ggplot(data = daily_combined_table_sf %>% filter(
# net_entries > 10,
# Feb_net_entries > 10,
Month == 4,
!is.na(Diff_net_entries_percentage),
DayType == "Weekday"
)) + geom_point(aes(x = Diff_net_entries_percentage, y = Diff_net_exits_percentage,
color = Borough, size = MedianIncome), shape = 1) +
labs(title = "Decline in Average Daily Station Entries vs Decline in Average Daily Station Exits",
subtitle = "From February to April",
x = "Percentage Decline in Turnstile Entries",
y = "Percentage Decline in Turnstile Exits",
size = "Median Income")
turnstile_data_grouped_zip <-
turnstile_data_grouped %>% inner_join(station_complex_to_zip)
## Joining, by = "complex_id"
income_percentiles <-
quantile(unique(turnstile_data_grouped_zip$MedianIncome), c(0.2, 0.4, 0.6, 0.8))
turnstile_data_grouped_zip_20 <-
turnstile_data_grouped_zip %>% filter(MedianIncome <= income_percentiles["20%"])
turnstile_data_grouped_zip_80 <-
turnstile_data_grouped_zip %>% filter(MedianIncome >= income_percentiles["80%"])
turnstile_summaries_20 <-
turnstile_data_grouped_zip_20 %>%
group_by(datetime = datetime_round) %>%
summarize(
entry_sum = sum(net_entries, na.rm = TRUE),
exit_sum = sum(net_entries, na.rm = TRUE)
) %>% ungroup()
turnstile_summaries_80 <-
turnstile_data_grouped_zip_80 %>%
group_by(datetime = datetime_round) %>%
summarize(
entry_sum = sum(net_entries, na.rm = TRUE),
exit_sum = sum(net_entries, na.rm = TRUE)
) %>% ungroup()
turnstile_week_percentages_20 <- get_percentage_of_period(turnstile_summaries_20, "week")
turnstile_week_percentages_80 <- get_percentage_of_period(turnstile_summaries_80, "week")
turnstile_month_percentages_20 <- get_percentage_of_period(turnstile_summaries_20, "month")
turnstile_month_percentages_80 <- get_percentage_of_period(turnstile_summaries_80, "month")
ggplot(data = turnstile_week_percentages_20) +
geom_point(aes(
x = as_date("2020-01-01") + (7*turnstile_week_percentages_20$Period),
y = ProportionOfDailyEntry,
color = as.factor(Hour)
)) +
geom_line(aes(
x = as_date("2020-01-01") + (7*turnstile_week_percentages_20$Period),
y = ProportionOfDailyEntry,
color = as.factor(Hour),
group = as.factor(Hour)
)) +
facet_wrap(vars(DayType)) +
labs(
title = "Proportion of Daily Total Turnstile Entries by Time Interval (20th Percentile Income)",
x = "Month",
y = "Proportion of All Rides",
color = "Interval Ending At",
subtitle = paste(
"Orange Vertical Line is when State of Emergency was Declared in NYC (March 12)",
"Red Vertical Line is when Statewide Stay-At-Home Order was Enacted (March 22)",
sep = "\n"
)
) +
geom_vline(
mapping = aes(xintercept = mdy("03-12-2020")),
color = "dark orange",
linetype = "dashed"
) +
geom_vline(
mapping = aes(xintercept = mdy("03-22-2020")),
color = "red",
linetype = "dashed"
)
## Warning: Use of `turnstile_week_percentages_20$Period` is discouraged. Use
## `Period` instead.
## Warning: Use of `turnstile_week_percentages_20$Period` is discouraged. Use
## `Period` instead.
ggplot(data = turnstile_week_percentages_80) +
geom_point(aes(
x = as_date("2020-01-01") + (7*turnstile_week_percentages_80$Period),
y = ProportionOfDailyEntry,
color = as.factor(Hour)
)) +
geom_line(aes(
x = as_date("2020-01-01") + (7*turnstile_week_percentages_80$Period),
y = ProportionOfDailyEntry,
color = as.factor(Hour),
group = as.factor(Hour)
)) +
facet_wrap(vars(DayType)) +
labs(
title = "Proportion of Daily Total Turnstile Entries by Time Interval (80th Percentile Income)",
x = "Month",
y = "Proportion of All Rides",
color = "Interval Ending At",
subtitle = paste(
"Orange Vertical Line is when State of Emergency was Declared in NYC (March 12)",
"Red Vertical Line is when Statewide Stay-At-Home Order was Enacted (March 22)",
sep = "\n"
)
) +
geom_vline(
mapping = aes(xintercept = mdy("03-12-2020")),
color = "dark orange",
linetype = "dashed"
) +
geom_vline(
mapping = aes(xintercept = mdy("03-22-2020")),
color = "red",
linetype = "dashed"
)
## Warning: Use of `turnstile_week_percentages_80$Period` is discouraged. Use
## `Period` instead.
## Warning: Use of `turnstile_week_percentages_80$Period` is discouraged. Use
## `Period` instead.
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
ggplot(data = turnstile_month_percentages_20 %>% filter(Period %in% c(2, 4)) %>%
mutate(Period = month.abb[Period])) +
geom_point(aes(x = ((Hour - 2) + 24) %% 24,
y = ProportionOfDailyEntry,
color = as.factor(Period))) +
geom_line(aes(x = ((Hour - 2) + 24) %% 24,
y = ProportionOfDailyEntry,
color = Period,
group = as.factor(Period))) +
facet_grid(. ~ DayType) +
labs(
x = "Time Of Day",
y = "Proportion of All Rides",
title = "Proportion of Daily Total Turnstile Entries by Time Interval in February and April",
subtitle = "Below 20th Percentile Income\nPoints are at Time Interval Midpoints for Easier Visual Interpretation",
color = "Month",
caption = "For instance, 10am in this chart corresponds to 12pm in the previous chart"
)
ggplot(data = turnstile_month_percentages_80 %>% filter(Period %in% c(2, 4)) %>%
mutate(Period = month.abb[Period])) +
geom_point(aes(x = ((Hour - 2) + 24) %% 24,
y = ProportionOfDailyEntry,
color = as.factor(Period))) +
geom_line(aes(x = ((Hour - 2) + 24) %% 24,
y = ProportionOfDailyEntry,
color = Period,
group = as.factor(Period))) +
facet_grid(. ~ DayType) +
labs(
x = "Time Of Day",
y = "Proportion of All Rides",
title = "Proportion of Daily Total Turnstile Entries by Time Interval in February and April",
subtitle = "Above 80th Percentile Income\nPoints are at Time Interval Midpoints for Easier Visual Interpretation",
color = "Month",
caption = "For instance, 10am in this chart corresponds to 12pm in the previous chart"
)